In [1]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model

from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.cluster import KMeans, DBSCAN
from sklearn import metrics


from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import metrics

import plotly.figure_factory as ff
In [2]:
# Load dataset
merged_data = pd.read_csv('merged_train.csv')
In [3]:
# Visualize dataset
merged_data.head()
Out[3]:
State County FIPS Total Population Percent White, not Hispanic or Latino Percent Black, not Hispanic or Latino Percent Hispanic or Latino Percent Foreign Born Percent Female Percent Age 29 and Under Percent Age 65 and Older Median Household Income Percent Unemployed Percent Less than High School Degree Percent Less than Bachelor's Degree Percent Rural Democratic Republican Party
0 AZ apache 4001 72346 18.571863 0.486551 5.947806 1.719515 50.598513 45.854643 13.322091 32460 15.807433 21.758252 88.941063 74.061076 16298 7810 1
1 AZ cochise 4003 128177 56.299492 3.714395 34.403208 11.458374 49.069646 37.902276 19.756275 45383 8.567108 13.409171 76.837055 36.301067 17383 26929 0
2 AZ coconino 4005 138064 54.619597 1.342855 13.711033 4.825298 50.581614 48.946141 10.873943 51106 8.238305 11.085381 65.791439 31.466066 34240 19249 1
3 AZ gila 4007 53179 63.222325 0.552850 18.548675 4.249798 50.296170 32.238290 26.397638 40593 12.129932 15.729958 82.262624 41.062000 7643 12180 0
4 AZ graham 4009 37529 51.461536 1.811932 32.097844 4.385942 46.313518 46.393456 12.315809 47422 14.424104 14.580797 86.675944 46.437399 3368 6870 0
In [ ]:
 
In [4]:
# Task 1
In [5]:
# Partition dataset into training and validationg sets using 80-20 split
x_train, x_val, y_train, y_val = train_test_split(merged_data.drop(['State', 'County', 'FIPS', 'Democratic', 'Republican', 'Party'], axis=1), merged_data[['Democratic', 'Republican', 'Party']], test_size=0.2, random_state=0)
In [6]:
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)
(956, 13)
(956, 3)
(239, 13)
(239, 3)
In [7]:
# Visualize training set: predictor variables
x_train.head()
Out[7]:
Total Population Percent White, not Hispanic or Latino Percent Black, not Hispanic or Latino Percent Hispanic or Latino Percent Foreign Born Percent Female Percent Age 29 and Under Percent Age 65 and Older Median Household Income Percent Unemployed Percent Less than High School Degree Percent Less than Bachelor's Degree Percent Rural
1090 63355 94.057296 1.314813 1.534212 1.067003 48.011996 36.140794 16.156578 52657 4.187743 7.879569 80.721503 46.142754
103 33461 91.802397 0.878635 5.905382 2.459580 50.521503 38.928902 15.600251 55485 6.384942 13.001171 83.457969 68.038712
745 7131 35.520965 2.131538 61.716449 20.782499 46.893844 45.982331 14.289721 40589 2.109447 32.383600 89.251795 28.750872
1048 58963 94.454149 0.746231 1.831657 3.578515 50.594441 34.287604 17.003884 58171 4.603880 6.360298 59.539254 52.805120
1186 8572 91.565562 0.443304 2.134858 1.656556 46.966869 32.571162 20.496967 54594 4.512276 5.105750 74.537343 49.025557
In [8]:
# Visualize training set: labels
y_train.head()
Out[8]:
Democratic Republican Party
1090 13798 13830 0
103 3278 6987 0
745 405 1202 0
1048 19395 6517 1
1186 722 3085 0
In [ ]:
 
In [9]:
# Task 2
In [10]:
# Standardize the training set and the validation set
scaler = StandardScaler() 
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train) 
x_val_scaled = scaler.transform(x_val)
In [ ]:
 
In [12]:
# Task 3
In [13]:
# Predicting the number of votes cast for the DEMOCRATIC party in each county
In [14]:
# Building Model using all variables
In [15]:
model = linear_model.LinearRegression()
fitted_model = model.fit(x_train_scaled, y_train['Democratic'])
print(fitted_model.coef_)
[ 67699.4769908   -3218.35778818  -1056.15862014  -6992.46123859
   3935.07911344     96.93882291  -4964.31964474  -1591.01933815
   1442.29141648   1302.93524322   3966.55100572 -10069.94516214
    -97.88148769]
In [15]:
predicted = fitted_model.predict(x_val_scaled)

corr_coef = numpy.corrcoef(predicted, y_val['Democratic'])[1, 0] 
R_squared = corr_coef**2
print(R_squared)
0.9312193337733043
In [ ]:
 
In [16]:
# Building Model using the top 4 most significant variables (based on project01 report)
In [17]:
model = linear_model.LinearRegression()
fitted_model = model.fit(x_train_scaled[:, [0, 1, 2, 11]], y_train['Democratic'])
print(fitted_model.coef_)
[69256.63509865   752.00100403  2017.87432223 -8892.08450096]
In [18]:
predicted = fitted_model.predict(x_val_scaled[:, [0, 1, 2, 11]])

corr_coef = numpy.corrcoef(predicted, y_val['Democratic'])[1, 0] 
R_squared = corr_coef**2
print(R_squared)
0.948509180896764
In [ ]:
 
In [19]:
# Removing seemingly least relevant variable (BEST MODEL FOR PREDICTING DEMOCRATIC VOTES)
In [20]:
model = linear_model.LinearRegression()
fitted_model = model.fit(x_train_scaled[:, [0, 2, 11]], y_train['Democratic'])
print(fitted_model.coef_)
[69073.08206907  1700.49107316 -8948.82185095]
In [21]:
predicted = fitted_model.predict(x_val_scaled[:, [0, 2, 11]])

corr_coef = numpy.corrcoef(predicted, y_val['Democratic'])[1, 0] 
R_squared = corr_coef**2
print(R_squared)
0.949133942937187
In [ ]:
 
In [22]:
# Adding multiple other variables to see if R squared can be increased
In [23]:
model = linear_model.LinearRegression()
fitted_model = model.fit(x_train_scaled[:, [0, 2, 5, 9, 11, 12]], y_train['Democratic'])
print(fitted_model.coef_)
[ 69323.93712355   1401.31874583   -418.56897963   1840.06113202
 -10276.39011065   1654.62454387]
In [24]:
predicted = fitted_model.predict(x_val_scaled[:, [0, 2, 5, 9, 11, 12]])

corr_coef = numpy.corrcoef(predicted, y_val['Democratic'])[1, 0] 
R_squared = corr_coef**2
print(R_squared)
0.9494594555838366
In [ ]:
 
In [25]:
# Predicting the number of votes cast for the REPUBLICAN party in each county
In [26]:
# Building Model using all variables
In [27]:
model = linear_model.LinearRegression()
fitted_model = model.fit(x_train_scaled, y_train['Republican'])
print(fitted_model.coef_)
[44403.15240906  1780.93181212 -2916.59301756  1217.16109357
 -6262.9194413  -1079.28398492  -772.9612049   2630.69368215
  5930.71380695  2062.35379292  3451.83253914 -2960.02130754
 -5698.28690602]
In [28]:
predicted = fitted_model.predict(x_val_scaled)

corr_coef = numpy.corrcoef(predicted, y_val['Republican'])[1, 0] 
R_squared = corr_coef**2
print(R_squared)
0.6948219003493747
In [ ]:
 
In [29]:
# Building Model using the top 4 most significant variables (based on project01 report)
In [30]:
model = linear_model.LinearRegression()
fitted_model = model.fit(x_train_scaled[:, [0, 1, 2, 11]], y_train['Republican'])
print(fitted_model.coef_)
[43826.27499765  2203.84789557 -1872.92686709 -4208.30811836]
In [31]:
predicted = fitted_model.predict(x_val_scaled[:, [0, 1, 2, 11]])

corr_coef = numpy.corrcoef(predicted, y_val['Republican'])[1, 0] 
R_squared = corr_coef**2
print(R_squared)
0.6474596835891537
In [ ]:
 
In [32]:
# Removing seemingly least relevant variable
In [33]:
model = linear_model.LinearRegression()
fitted_model = model.fit(x_train_scaled[:, [0, 1, 11]], y_train['Republican'])
print(fitted_model.coef_)
[43640.78531753  3033.48577167 -4146.42294453]
In [34]:
predicted = fitted_model.predict(x_val_scaled[:, [0, 1, 11]])

corr_coef = numpy.corrcoef(predicted, y_val['Republican'])[1, 0] 
R_squared = corr_coef**2
print(R_squared)
0.6499687277390198
In [ ]:
 
In [35]:
# Adding more variables that seem relevant to republican counties to see if R squared can be increased
In [36]:
model = linear_model.LinearRegression()
fitted_model = model.fit(x_train_scaled[:, [0, 1, 7, 8, 12]], y_train['Republican'])
print(fitted_model.coef_)
[42791.44622898  3446.69948382  2495.11631892  4028.26822585
 -4816.50015819]
In [37]:
predicted = fitted_model.predict(x_val_scaled[:, [0, 1, 7, 8, 12]])

corr_coef = numpy.corrcoef(predicted, y_val['Republican'])[1, 0] 
R_squared = corr_coef**2
print(R_squared)
0.6737301771404883
In [ ]:
 
In [38]:
# Removing some variables to reduce overfitting (BEST model for predicting Republican)
In [39]:
model = linear_model.LinearRegression()
fitted_model = model.fit(x_train_scaled[:, [0, 1, 12]], y_train['Republican'])
print(fitted_model.coef_)
[43470.26429267  4715.26381654 -5038.14337356]
In [40]:
predicted = fitted_model.predict(x_val_scaled[:, [0, 1, 12]])

corr_coef = numpy.corrcoef(predicted, y_val['Republican'])[1, 0] 
R_squared = corr_coef**2
print(R_squared)
0.6692570988078124
In [ ]:
 
In [ ]:
 
In [41]:
#Task 4
In [42]:
#Decision Trees using different criterions
#Classify into two parties so only get the parties for each variable
#Using only the 4 significant variables found in project 1 
classifier_y_train = y_train['Party']
classifier_y_test = y_val['Party']
In [43]:
#EntropyClassifier
Eclassifier = DecisionTreeClassifier(criterion="entropy")
Eclassifier.fit(x_train_scaled[:, [0, 1, 2, 11]],classifier_y_train)
Out[43]:
DecisionTreeClassifier(criterion='entropy')
In [44]:
y_pred = Eclassifier.predict(x_val_scaled[:, [0, 1, 2, 11]])
In [45]:
conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Entropy Decision Tree Confusion Matrix')
plt.tight_layout()
In [46]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.7656903765690377, 0.2343096234309623, array([0.86227545, 0.54166667]), array([0.81355932, 0.62903226]), array([0.8372093 , 0.58208955])]
In [47]:
Gclassifier = DecisionTreeClassifier(criterion="gini")
Gclassifier.fit(x_train_scaled[:, [0, 1, 2, 11]],classifier_y_train)
Out[47]:
DecisionTreeClassifier()
In [48]:
y_pred = Gclassifier.predict(x_val_scaled[:, [0, 1, 2, 11]])
In [49]:
conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Gini Decision Tree Confusion Matrix')
plt.tight_layout()
In [50]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.7615062761506276, 0.2384937238493724, array([0.83707865, 0.54098361]), array([0.84180791, 0.53225806]), array([0.83943662, 0.53658537])]
In [51]:
#Entropy is slightly better here because even though Gini has a better accuracy by 0.01 it has a F1 score less by 0.03 
In [52]:
#K-nearest Neighbors
#Chose the best number of neighbors based of accuracy
neighbors = [2,3,5,10,15,25,50]
prevAccuracy = 0.0
myN = 0
for n in neighbors:
    classifier = KNeighborsClassifier(n_neighbors = n)
    classifier.fit(x_train_scaled[:, [0, 1, 2, 11]],classifier_y_train)
    y_pred = classifier.predict(x_val_scaled[:, [0, 1, 2, 11]])
    accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
    if(accuracy > prevAccuracy):
        prevAccuracy = accuracy
        myN = n

print(myN)    
3
In [53]:
#Best model is with 3 neighbors
#Compute statistics
classifier = KNeighborsClassifier(n_neighbors = 3)
classifier.fit(x_train_scaled[:, [0, 1, 2, 11]],classifier_y_train)
y_pred = classifier.predict(x_val_scaled[:, [0, 1, 2, 11]])
In [54]:
conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('K=3 nearest neigbhors Confusion Matrix')
plt.tight_layout()
In [55]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.8326359832635983, 0.16736401673640167, array([0.88268156, 0.68333333]), array([0.89265537, 0.66129032]), array([0.88764045, 0.67213115])]
In [56]:
#SVMs
#First try to find the best kernel 
classifier = SVC(kernel = 'linear')
classifier.fit(x_train_scaled[:, [0, 1, 2, 11]],classifier_y_train)
y_pred = classifier.predict(x_val_scaled[:, [0, 1, 2, 11]])

conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('SVM linear kernel Confusion Matrix')
plt.tight_layout()
In [57]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.7907949790794979, 0.20920502092050208, array([0.80382775, 0.7       ]), array([0.94915254, 0.33870968]), array([0.87046632, 0.45652174])]
In [58]:
#SVMs
#First try to find the best kernel 
classifier = SVC(kernel = 'poly')
classifier.fit(x_train_scaled[:, [0, 1, 2, 11]],classifier_y_train)
y_pred = classifier.predict(x_val_scaled[:, [0, 1, 2, 11]])

conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('SVM poly kernel Confusion Matrix')
plt.tight_layout()
In [59]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.8158995815899581, 0.18410041841004188, array([0.81220657, 0.84615385]), array([0.97740113, 0.35483871]), array([0.88717949, 0.5       ])]
In [60]:
#SVMs
#First try to find the best kernel 
classifier = SVC(kernel = 'rbf')
classifier.fit(x_train_scaled[:, [0, 1, 2, 11]],classifier_y_train)
y_pred = classifier.predict(x_val_scaled[:, [0, 1, 2, 11]])

conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('SVM rbf kernel Confusion Matrix')
plt.tight_layout()
In [61]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.8284518828451883, 0.17154811715481166, array([0.83663366, 0.78378378]), array([0.95480226, 0.46774194]), array([0.89182058, 0.58585859])]
In [62]:
#SVMs
#First try to find the best kernel 
classifier = SVC(kernel = 'sigmoid')
classifier.fit(x_train_scaled[:, [0, 1, 2, 11]],classifier_y_train)
y_pred = classifier.predict(x_val_scaled[:, [0, 1, 2, 11]])

conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('SVM sigmoid kernel Confusion Matrix')
plt.tight_layout()
In [63]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.6610878661087866, 0.33891213389121344, array([0.78571429, 0.36619718]), array([0.74576271, 0.41935484]), array([0.76521739, 0.39097744])]
In [64]:
# Rbf is the best kernel
In [65]:
# Now try to find the best c value
# based on accuracy 
values = [0.001,0.01,0.1,0.5,1.0,5.0,10]
prevAcc = 0.0
cVal = -1
for v in values:
    classifier = SVC(kernel = 'rbf', C = v)
    classifier.fit(x_train_scaled[:, [0, 1, 2, 11]],classifier_y_train)
    y_pred = classifier.predict(x_val_scaled[:, [0, 1, 2, 11]])
    accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
    if(accuracy > prevAcc):
        prevAcc = accuracy
        cVal = v
print(cVal)
5.0
In [66]:
#best c val is 5.0 
# get metrics
classifier = SVC(kernel = 'rbf',C=5.0)
classifier.fit(x_train_scaled[:, [0, 1, 2, 11]],classifier_y_train)
y_pred = classifier.predict(x_val_scaled[:, [0, 1, 2, 11]])

conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('SVM rbf,c=5.0 kernel Confusion Matrix')
plt.tight_layout()
In [67]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.8368200836820083, 0.16317991631799167, array([0.845     , 0.79487179]), array([0.95480226, 0.5       ]), array([0.89655172, 0.61386139])]
In [68]:
#Now that we found the optimal models for each type of classifier
#lets try a different combination of variables for the best models for each classifier
#combination: Percent under age 25,median household income, percent unemployed, percent less than bach degree, percent roral
# [:,[6,8,9,11,12]]
In [69]:
#Best decision tree model on new combination
Eclassifier = DecisionTreeClassifier(criterion="entropy")
Eclassifier.fit(x_train_scaled[:,[6,8,9,11,12]],classifier_y_train)
y_pred = Eclassifier.predict(x_val_scaled[:,[6,8,9,11,12]])
conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Entropy Decision , newCombo Tree Confusion Matrix')
plt.tight_layout()
In [70]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.7364016736401674, 0.2635983263598326, array([0.83928571, 0.49295775]), array([0.79661017, 0.56451613]), array([0.8173913 , 0.52631579])]
In [71]:
#The new decision tree is much worse than the previous combination 
In [72]:
#Create K -nearest neighbors model with new combination 
classifier = KNeighborsClassifier(n_neighbors = 3)
classifier.fit(x_train_scaled[:,[6,8,9,11,12]],classifier_y_train)
y_pred = classifier.predict(x_val_scaled[:,[6,8,9,11,12]])
conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('3 neighbors classifier, new Combo Confusion Matrix')
plt.tight_layout()
In [73]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.7531380753138075, 0.2468619246861925, array([0.80729167, 0.53191489]), array([0.87570621, 0.40322581]), array([0.8401084, 0.4587156])]
In [74]:
# K nearest neighbors also is worse with the new combination of variables 
In [75]:
#Create SVM with new combination of variables 
In [76]:
classifier = SVC(kernel = 'rbf',C=5.0)
classifier.fit(x_train_scaled[:,[6,8,9,11,12]],classifier_y_train)
y_pred = classifier.predict(x_val_scaled[:,[6,8,9,11,12]])

conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('SVM rbf,c=5.0, new Combo Confusion Matrix')
plt.tight_layout()
In [77]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.803347280334728, 0.19665271966527198, array([0.81862745, 0.71428571]), array([0.94350282, 0.40322581]), array([0.87664042, 0.51546392])]
In [78]:
# This new model also performs worst.
In [79]:
#Lets create models with all the variables and see if we get improvement
#Decision tree all variables
Eclassifier = DecisionTreeClassifier(criterion="entropy")
Eclassifier.fit(x_train_scaled,classifier_y_train)
y_pred = Eclassifier.predict(x_val_scaled)
conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Entropy Decision Tree, all variables  Confusion Matrix')
plt.tight_layout()
In [80]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.7698744769874477, 0.2301255230125523, array([0.84269663, 0.55737705]), array([0.84745763, 0.5483871 ]), array([0.84507042, 0.55284553])]
In [81]:
#Even though accuracy increased by 0.01, F1 score decreasd by 0.04 so the model with 4 variables is better here 
In [82]:
#K-nearest neighbors for all variables
classifier = KNeighborsClassifier(n_neighbors = 3)
classifier.fit(x_train_scaled,classifier_y_train)
y_pred = classifier.predict(x_val_scaled)
conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('3 neighbors classifier, all variables Confusion Matrix')
plt.tight_layout()
In [83]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.7949790794979079, 0.20502092050209209, array([0.83684211, 0.63265306]), array([0.89830508, 0.5       ]), array([0.86648501, 0.55855856])]
In [84]:
# This model is also worse than the original model 
In [85]:
#SVM all variables
classifier = SVC(kernel = 'rbf',C=5.0)
classifier.fit(x_train_scaled,classifier_y_train)
y_pred = classifier.predict(x_val_scaled)

conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('SVM rbf,c=5.0, all variables Confusion Matrix')
plt.tight_layout()
In [86]:
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
[0.8410041841004184, 0.15899581589958156, array([0.83902439, 0.85294118]), array([0.97175141, 0.46774194]), array([0.90052356, 0.60416667])]
In [87]:
# This model is actually better than both other models 
In [88]:
#The best performing model is K nearest neighbors with 3 neighbors on original combination of variables because even though
#there are other models with similiar accuracy K-nearest neighbors has around 0.07 increase in F-score compared to the other models 
In [ ]:
 
In [ ]:
 
In [89]:
#Task 5
In [90]:
# Choosing the variables 
X = merged_data[['Total Population', 'Percent White, not Hispanic or Latino', 'Percent Black, not Hispanic or Latino',"Percent Less than Bachelor's Degree"]]
Y = merged_data['Party']
In [91]:
#Hierarchical Clustering
In [92]:
scaler = StandardScaler() 
scaler.fit(X)
X_scaled = scaler.transform(X) 
In [93]:
clustering = linkage(X_scaled,method = "single",metric = "euclidean")
In [94]:
clusters = fcluster(clustering,2,criterion = 'maxclust')
In [95]:
cont_matrix = metrics.cluster.contingency_matrix(merged_data["Party"],clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()
In [96]:
adjusted_rand_index = metrics.adjusted_rand_score(merged_data['Party'], clusters)
silhouette_coefficient = metrics.silhouette_score(X, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])
[0.005608925119335567, 0.9531008210193291]
In [97]:
# Plot clusters found using hierarchical clustering with single linkage method
# data['clusters'] = clusters
ax = merged_data.plot(kind = 'scatter', x = 'Total Population', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent White, not Hispanic or Latino', y = 'Percent Black, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = "Percent Less than Bachelor's Degree", c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Total Population', y = "Percent Less than Bachelor's Degree", c = 'Party', colormap = plt.cm.brg)
In [98]:
# Complete Linkage using euclidean
In [99]:
clustering = linkage(X, method = "complete", metric = "euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
In [100]:
cont_matrix = metrics.cluster.contingency_matrix(merged_data['Party'],clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()
In [101]:
adjusted_rand_index = metrics.adjusted_rand_score(merged_data['Party'], clusters)
silhouette_coefficient = metrics.silhouette_score(X, merged_data['Party'], metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])
[0.005608925119335567, 0.4204042259224274]
In [102]:
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
In [103]:
#KMeans Clustering iteration = 1
In [104]:
clustering = KMeans(n_clusters = 2, init = 'random', n_init = 1, random_state = 0).fit(X)
clusters = clustering.labels_
In [105]:
cont_matrix = metrics.cluster.contingency_matrix(merged_data['Party'],clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()
In [106]:
adjusted_rand_index = metrics.adjusted_rand_score(merged_data['Party'], clusters)
silhouette_coefficient = metrics.silhouette_score(X, merged_data['Party'], metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])
[0.11979747814620154, 0.4204042259224274]
In [107]:
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
In [108]:
#KMeans incresing the number of iteration
In [109]:
clustering = KMeans(n_clusters = 2, init = 'k-means++', n_init = 10).fit(X)
# clustering = KMeans(n_clusters = 4, init = 'random', n_init = 20, random_state = 0).fit(X)
clusters = clustering.labels_
In [110]:
cont_matrix = metrics.cluster.contingency_matrix(merged_data['Party'],clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()
In [111]:
adjusted_rand_index = metrics.adjusted_rand_score(merged_data['Party'], clusters)
silhouette_coefficient = metrics.silhouette_score(X, merged_data['Party'], metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])
[0.11979747814620154, 0.4204042259224274]
In [112]:
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
In [113]:
#DBSCAN
In [114]:
clustering = DBSCAN(eps = 1, min_samples = 5, metric = "euclidean").fit(X)
clusters = clustering.labels_
In [115]:
cont_matrix = metrics.cluster.contingency_matrix(merged_data['Party'],clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()
In [116]:
adjusted_rand_index = metrics.adjusted_rand_score(merged_data['Party'], clusters)
silhouette_coefficient = metrics.silhouette_score(X, merged_data['Party'], metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])
[0.0, 0.4204042259224274]
In [117]:
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
In [118]:
# DBSCAN with different eps and min_sample values and metrics
In [119]:
clustering = DBSCAN(eps = 5, min_samples = 500, metric = "manhattan").fit(X)
clusters = clustering.labels_
In [120]:
cont_matrix = metrics.cluster.contingency_matrix(merged_data['Party'],clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()
In [121]:
adjusted_rand_index = metrics.adjusted_rand_score(merged_data['Party'], clusters)
silhouette_coefficient = metrics.silhouette_score(X, merged_data['Party'], metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])
[0.0, 0.4204042259224274]
In [122]:
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
In [ ]:
 
In [123]:
# Evaluation metrics
In [124]:
silhouette_coefficient = metrics.silhouette_score(X, merged_data['Party'], metric = "euclidean")
print(silhouette_coefficient)
0.4204042259224274
In [125]:
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
In [ ]:
 
In [ ]:
 
In [11]:
#Task 6
test_data = pd.read_csv('demographics_test.csv')
new_train = merged_data.drop(['State', 'County', 'FIPS', 'Democratic', 'Republican', 'Party'], axis=1)
new_train_val = merged_data['Party']
scaler = StandardScaler()
scaler.fit(new_train)
new_train_scaled = scaler.transform(new_train)
new_test_data = test_data.drop(['State', 'County', 'FIPS'], axis=1)
scaler = StandardScaler()
scaler.fit(new_test_data)
new_test_data_scaled = scaler.transform(new_test_data)
new_test_data.head()
Out[11]:
Total Population Percent White, not Hispanic or Latino Percent Black, not Hispanic or Latino Percent Hispanic or Latino Percent Foreign Born Percent Female Percent Age 29 and Under Percent Age 65 and Older Median Household Income Percent Unemployed Percent Less than High School Degree Percent Less than Bachelor's Degree Percent Rural
0 1730 98.265896 0.057803 0.462428 0.346821 51.156069 27.109827 15.606936 70000 3.755365 8.415466 83.396513 100.000000
1 12107 5.798299 0.594697 93.326175 9.193029 49.723301 49.302057 12.480383 26639 11.955168 40.840797 90.869691 38.032029
2 25260 73.804434 16.722090 4.441805 2.505938 50.166271 40.186065 11.868567 84342 6.479939 7.152824 65.540254 73.189450
3 805965 66.354867 25.654340 2.890944 5.086945 51.870615 40.779686 14.161657 50399 7.864630 9.873275 64.404446 2.231877
4 29107 63.809393 8.479060 25.502456 9.946061 50.671660 37.351840 17.799842 56681 5.782337 17.579456 79.008391 66.344090
In [12]:
#Train best classification model on whole merged data with the best combination of variables and get prediction of the test data
classifier = KNeighborsClassifier(n_neighbors = 3)
classifier.fit(new_train_scaled[:, [0, 1, 2, 11]],new_train_val)
y_pred = classifier.predict(new_test_data_scaled[:, [0, 1, 2, 11]])
In [13]:
fips = test_data['FIPS'].tolist()
values = y_pred.tolist()

fig = ff.create_choropleth(fips=fips, values=values, colorscale=['rgb(255, 0, 0)', 'rgb(0, 0, 255)'], title='US Counties Political Map', legend_title='1 = Democratic, 0 = Republican')
fig.layout.template = None
fig.show()
In [ ]:
 
In [ ]:
 
In [21]:
# Task 7
In [32]:
merged_data.head()
new_train_scaled
Out[32]:
array([[-1.52852858e-01, -3.06640757e+00, -5.46700242e-01, ...,
         1.29637277e+00,  1.12527371e+00,  5.66216259e-01],
       [ 2.22712198e-02, -1.15599154e+00, -1.99123648e-01, ...,
        -3.41198657e-03, -1.93511628e-01, -6.10414363e-01],
       [ 5.32835921e-02, -1.24105649e+00, -4.54492728e-01, ...,
        -3.65179484e-01, -1.39698044e+00, -7.61076704e-01],
       ...,
       [-2.39218252e-01,  3.47986766e-02, -5.05858154e-01, ...,
        -6.40853821e-01,  1.67498771e-03, -1.40142266e+00],
       [-3.14244542e-01,  4.34968142e-01, -5.78992113e-01, ...,
        -4.77916280e-01,  3.46469490e-01, -3.98681046e-01],
       [-3.53584805e-01,  1.65525491e-01, -5.13989663e-01, ...,
        -1.32947570e-01,  3.38615907e-02, -6.21212750e-01]])
In [33]:
model = linear_model.LinearRegression()
democratic_regression_model = model.fit(new_train_scaled[:, [0, 2, 11]], merged_data['Democratic'])
print(democratic_regression_model.coef_)

democratic_regression_predicted = democratic_regression_model.predict(new_test_data_scaled[:, [0, 2, 11]])
# print(predicted.size)

# corr_coef = numpy.corrcoef(predicted, merged_data['Democratic'])[1, 0] 
# R_squared = corr_coef**2
# print(R_squared)
[64340.66277419  1631.68545204 -7880.35446116]
In [34]:
model = linear_model.LinearRegression()
republican_regression_model = model.fit(new_train_scaled[:, [0, 1, 12]], merged_data['Republican'])
print(republican_regression_model.coef_)

republican_regression_predicted = republican_regression_model.predict(new_test_data_scaled[:, [0, 1, 12]])
# print(predicted.size)

# corr_coef = numpy.corrcoef(predicted, y_val['Republican'])[1, 0] 
# R_squared = corr_coef**2
# print(R_squared)
[39957.51528026  4365.45360822 -5201.53183584]
In [35]:
classifier = KNeighborsClassifier(n_neighbors = 3)
classifier.fit(new_train_scaled[:, [0, 1, 2, 11]],new_train_val)
y_pred = classifier.predict(new_test_data_scaled[:, [0, 1, 2, 11]])
# print(y_pred.size)
In [37]:
names = test_data['State'].tolist()
counties = test_data['County'].tolist()
republican_regression_predicted_list = republican_regression_predicted.tolist()
democratic_regression_predicted_list = democratic_regression_predicted.tolist()
labels_list = y_pred.tolist()

print(names)
print(counties)
print(republican_regression_predicted)
['NV', 'TX', 'VA', 'OH', 'TX', 'MI', 'NM', 'TX', 'NJ', 'PA', 'IN', 'NV', 'NE', 'VA', 'FL', 'MI', 'TX', 'NJ', 'OH', 'OH', 'ND', 'TX', 'NE', 'VA', 'ND', 'UT', 'MN', 'FL', 'TX', 'MT', 'NY', 'TX', 'VA', 'PA', 'MI', 'MI', 'MN', 'ND', 'ND', 'WY', 'MT', 'VT', 'FL', 'AZ', 'TX', 'MA', 'WV', 'TX', 'NY', 'TN', 'TX', 'NJ', 'TX', 'NY', 'TX', 'UT', 'MD', 'FL', 'IN', 'MD', 'MI', 'TX', 'DE', 'VA', 'OH', 'WV', 'NM', 'FL', 'NM', 'UT', 'FL', 'TX', 'MI', 'IN', 'MT', 'FL', 'WV', 'TX', 'WA', 'VA', 'VA', 'MI', 'MA', 'MT', 'TX', 'TN', 'NE', 'WV', 'WY', 'VT', 'WI', 'TX', 'MA', 'FL', 'OH', 'NM', 'NV', 'WI', 'TX', 'WV', 'TX', 'MI', 'UT', 'NY', 'TX', 'VA', 'MN', 'VA', 'NM', 'TX', 'MN', 'MN', 'TX', 'IN', 'WV', 'PA', 'TX', 'MD', 'TN', 'VA', 'MN', 'PA', 'TN', 'TN', 'WI', 'NJ', 'IN', 'TX', 'VA', 'NY', 'NM', 'WY', 'MI', 'TN', 'PA', 'MT', 'UT', 'OH', 'VA', 'FL', 'NY', 'ND', 'IN', 'NE', 'TX', 'MT', 'WA', 'TX', 'TN', 'NE', 'VA', 'NE', 'TX', 'NE', 'PA', 'NJ', 'TX', 'NE', 'OH', 'TX', 'TX', 'NV', 'IN', 'WI', 'NY', 'IN', 'TN', 'MT', 'MN', 'MI', 'NY', 'TX', 'OH', 'WI', 'TN', 'TX', 'MT', 'MD', 'UT', 'WY', 'PA', 'MI', 'NJ', 'WI', 'PA', 'PA', 'IN', 'ND', 'WA', 'TX', 'MD', 'VA', 'VA', 'IN', 'CT', 'PA', 'NY', 'PA', 'WI', 'PA', 'NJ', 'TN', 'PA', 'TX', 'TX', 'TX', 'TN', 'IN', 'OH', 'WY', 'MI', 'IN', 'VA', 'ME', 'TN', 'NE', 'MN', 'NE', 'NY', 'WI', 'WI', 'VA', 'ME', 'FL', 'TX', 'MN', 'TX', 'WY', 'MI', 'FL', 'WV', 'PA', 'WI', 'WI', 'TN', 'NY', 'MT', 'NY', 'MN', 'ND', 'WY', 'NE', 'TX', 'ND', 'OH', 'WV', 'NE', 'NM', 'NY', 'NM', 'FL', 'TX', 'MI', 'NE', 'VA', 'OH', 'UT', 'NJ', 'TN', 'UT', 'TX', 'TX', 'FL', 'PA', 'TX', 'CT', 'TX', 'NE', 'MI', 'ND', 'TX', 'WI', 'VA', 'ND', 'VA', 'MN', 'TX', 'FL', 'TX', 'MN', 'TX', 'IN', 'WI', 'MI', 'ME', 'TX', 'MD', 'OH', 'UT', 'UT', 'VA', 'NE', 'PA', 'MI', 'TX', 'NE', 'ND', 'IN', 'ME', 'AZ', 'OH', 'VA', 'MT', 'WI', 'ND', 'TX', 'MN', 'WV', 'MN', 'VA', 'NY', 'MI', 'TX', 'MD', 'OH', 'TX', 'TN', 'NM', 'NJ', 'ND', 'NE', 'WA', 'WY', 'TX', 'VT', 'OH', 'MN', 'NE', 'VA', 'WV', 'MN', 'MN', 'TX', 'NE', 'TX', 'MA', 'UT', 'TX', 'VA', 'ME', 'MT', 'WV', 'TX', 'VA', 'IN', 'MA', 'VA', 'NE', 'TX', 'IN', 'ND', 'NY', 'NM', 'TN', 'OH', 'IN', 'IN', 'NE', 'ND', 'WV', 'TX', 'WA', 'NM', 'TX', 'WV', 'ND', 'MT', 'VA', 'VA', 'OH', 'TN', 'NE', 'TX', 'MI', 'VA', 'IN', 'NY', 'VA', 'WI', 'ND', 'WI', 'IN', 'OH', 'UT', 'MN', 'MI', 'MN', 'TX', 'TN', 'TX', 'PA', 'FL', 'WA', 'OH', 'TX', 'VT', 'OH', 'NE', 'MD', 'NY']
['eureka', 'zavala', 'king george', 'hamilton', 'austin', 'barry', 'valencia', 'ellis', 'mercer', 'cambria', 'switzerland', 'lander', 'cherry', 'radford city', 'lee', 'arenac', 'shackelford', 'gloucester', 'trumbull', 'lawrence', 'burke', 'hardeman', 'keya paha', 'norton city', 'bowman', 'duchesne', 'carlton', 'okaloosa', 'oldham', 'lewis and clark', 'rockland', 'waller', 'falls church city', 'potter', 'gratiot', 'shiawassee', 'polk', 'billings', 'mckenzie', 'weston', 'jefferson', 'bennington', 'clay', 'yuma', 'terrell', 'bristol', 'webster', 'collingsworth', 'chautauqua', 'fayette', 'bandera', 'union', 'sherman', 'seneca', 'martin', 'tooele', 'talbot', 'highlands', 'gibson', 'allegany', 'mecosta', 'jasper', 'new castle', 'warren', 'ashtabula', 'ohio', 'bernalillo', 'calhoun', 'torrance', 'box elder', 'columbia', 'edwards', 'iron', 'bartholomew', 'treasure', 'liberty', 'brooke', 'ward', 'skamania', 'caroline', 'floyd', 'dickinson', 'dukes', 'richland', 'trinity', 'clay', 'howard', 'putnam', 'laramie', 'lamoille', 'green', 'childress', 'hampden', 'hardee', 'allen', 'colfax', 'pershing', 'vilas', 'gillespie', 'cabell', 'cameron', 'clare', 'davis', 'dutchess', 'motley', 'henrico', 'grant', 'rappahannock', 'taos', 'smith', 'yellow medicine', 'pipestone', 'somervell', 'benton', 'hampshire', 'columbia', 'ector', 'howard', 'jefferson', 'manassas city', 'scott', 'northampton', 'fentress', 'dyer', 'burnett', 'monmouth', 'brown', 'chambers', 'roanoke city', 'putnam', 'san juan', 'crook', 'ingham', 'obion', 'franklin', 'lake', 'cache', 'fayette', 'accomack', 'pasco', 'nassau', 'towner', 'kosciusko', 'colfax', 'frio', 'madison', 'island', 'andrews', 'hardin', 'dundy', 'chesapeake city', 'hayes', 'parmer', 'buffalo', 'carbon', 'atlantic', 'kimble', 'seward', 'lake', 'tom green', 'mason', 'humboldt', 'noble', 'dunn', 'lewis', 'orange', 'crockett', 'beaverhead', 'le sueur', 'berrien', 'otsego', 'matagorda', 'marion', 'washburn', 'cocke', 'orange', 'yellowstone', 'carroll', 'wayne', 'park', 'pike', 'lenawee', 'burlington', 'iowa', 'huntingdon', 'berks', 'knox', 'ramsey', 'klickitat', 'howard', "queen anne's", 'buckingham', 'rockingham', 'henry', 'litchfield', 'erie', 'albany', 'bucks', 'buffalo', 'mckean', 'cape may', 'haywood', 'fulton', 'jim wells', 'lee', 'williamson', 'knox', 'putnam', 'noble', 'big horn', 'ionia', 'daviess', 'cumberland', 'waldo', 'scott', 'stanton', 'sherburne', 'boyd', 'orleans', 'vernon', 'oconto', 'louisa', 'aroostook', 'sumter', 'newton', 'cook', 'brooks', 'hot springs', 'kalkaska', 'levy', 'harrison', 'butler', 'price', 'shawano', 'carter', 'genesee', 'missoula', 'jefferson', 'itasca', 'williams', 'sheridan', 'garden', 'coleman', 'ward', 'wood', 'wirt', 'box butte', 'los alamos', 'bronx', 'sierra', 'palm beach', 'haskell', 'midland', 'blaine', 'bedford', 'paulding', 'millard', 'essex', 'humphreys', 'washington', 'briscoe', 'wise', 'volusia', 'schuylkill', 'willacy', 'new london', 'kinney', 'saline', 'tuscola', 'slope', 'val verde', 'fond du lac', 'powhatan', 'richland', 'york', 'dodge', 'dickens', 'jefferson', 'titus', 'lyon', 'hall', 'spencer', 'marquette', 'allegan', 'kennebec', 'hidalgo', 'montgomery', 'wyandot', 'summit', 'rich', 'clarke', 'kearney', 'mifflin', 'charlevoix', 'walker', 'thurston', 'traill', 'clinton', 'sagadahoc', 'greenlee', 'meigs', 'hanover', 'chouteau', 'sauk', 'mchenry', 'concho', 'todd', 'mingo', 'winona', 'rockbridge', 'rensselaer', 'isabella', 'wharton', 'harford', 'van wert', 'madison', 'anderson', 'lincoln', 'morris', 'oliver', 'wayne', 'asotin', 'teton', 'erath', 'grand isle', 'morrow', 'kittson', 'holt', 'petersburg city', 'hancock', 'redwood', 'hennepin', 'hansford', 'greeley', 'rains', 'worcester', 'weber', 'delta', 'waynesboro city', 'piscataquis', 'petroleum', 'logan', 'jefferson', 'virginia beach city', 'vermillion', 'berkshire', 'prince edward', 'sioux', 'presidio', 'jennings', 'sioux', 'madison', 'sandoval', 'weakley', 'montgomery', 'miami', 'newton', 'sherman', 'stark', 'pleasants', 'panola', 'whitman', 'hidalgo', 'ochiltree', 'nicholas', 'divide', 'garfield', 'gloucester', 'fauquier', 'henry', 'union', 'deuel', 'gregg', 'kent', 'fairfax city', 'elkhart', 'new york', 'giles', 'polk', 'bottineau', 'richland', 'wabash', 'champaign', 'emery', 'olmsted', 'kalamazoo', 'marshall', 'roberts', 'trousdale', 'kleberg', 'blair', 'hernando', 'kitsap', 'franklin', 'cass', 'chittenden', 'butler', 'franklin', 'cecil', 'yates']
[-1.58569822e+03 -1.00216959e+04  1.60709519e+03  1.53735715e+05
  1.21620726e+03  1.17651746e+04  1.11142118e+04  3.04459845e+04
  7.08948414e+04  3.27594097e+04 -3.99176693e+02  1.89658940e+03
  4.55929213e+03  1.30940618e+04  1.30935318e+05  1.99905072e+02
 -3.99910977e+03  6.15551489e+04  4.45705587e+04  1.70637394e+04
 -2.51866702e+03 -8.04203013e+03 -1.89189188e+03  1.14506120e+04
 -2.44505372e+03  4.00050100e+03  9.43867929e+03  4.25502930e+04
 -6.91050194e+03  1.98604231e+04  6.50545542e+04  7.59822626e+02
  1.05306978e+04  9.11925162e+02  9.46325626e+03  1.68776809e+04
  9.80619900e+03 -2.48825521e+03 -4.40075738e+03  5.19933280e+03
 -8.76351504e+02  9.56671053e+03  4.29228613e+04  3.50540287e+04
 -1.68189234e+04  1.10424707e+05 -8.12482792e+00 -9.99949536e+03
  2.87210181e+04  1.98531466e+03 -2.19485225e+03  1.01987143e+05
 -1.03421467e+04  9.04326617e+03 -1.08067732e+04  1.91112769e+04
  7.66871085e+03  2.21222518e+04  1.06502289e+04  2.04748541e+04
  9.75397451e+03  2.88732220e+03  1.05416770e+05  1.07008386e+04
  2.28118368e+04  1.67397505e+04  1.23739913e+05  1.27416295e+03
 -8.58814123e+03  1.59323965e+04  1.10767063e+04 -1.29235835e+04
  3.76216277e+03  1.99946480e+04 -3.21341070e+03 -5.70568030e+03
  1.09095883e+04 -6.91875493e+02 -1.99600569e+03 -6.30279506e+02
 -3.72664595e+01  1.27920862e+04  8.29568973e+03  7.09880819e+03
  1.86776937e+02 -1.20700724e+03 -1.27111731e+03  1.78947163e+04
  2.40843329e+04  1.96311938e+03  1.03075680e+04  2.67814762e+03
  9.06598710e+04  2.67488748e+02  2.51767812e+04 -3.13272117e+03
 -7.60993879e+03 -8.35098502e+02  5.29189213e+03  2.62627520e+04
  6.97599229e+04  7.58647507e+03  7.08071042e+04  5.82647565e+04
 -4.28255499e+03  6.24931792e+04 -1.33986626e+03 -2.45585769e+03
 -2.96211592e+03  4.08231725e+04  1.27367741e+03  4.74210322e+03
 -4.50696803e+03 -1.48044618e+03  1.91354760e+03  1.85065375e+04
  2.66934231e+04  5.93246858e+04  1.28557886e+04  9.37781325e+03
  3.31775986e+04  6.23385254e+04  1.07659638e+03  1.00875372e+04
 -7.41792149e+02  1.22919567e+05  3.37647201e+02  7.00637776e+03
  2.35953253e+04  2.48778416e+04  1.84068953e+04 -8.99564231e+02
  5.77389190e+04  6.60711660e+03  3.33037991e+04 -9.70478376e+02
  2.98735157e+04  1.03940884e+04 -4.01218685e+03  9.69293981e+04
  2.53812649e+05 -4.10022859e+03  1.87380513e+04 -5.92302515e+02
 -3.99920785e+03 -1.34333638e+03  1.72295661e+04  2.13022238e+03
  6.51385648e+03 -2.72453230e+03  4.66138512e+04 -2.62303147e+03
 -7.39401776e+03  1.52307815e+04  1.68936928e+04  5.28215987e+04
  2.79801502e+03  6.77652442e+03  5.26923247e+04  2.31836443e+04
 -6.93834762e+03  3.82734383e+03  9.52828803e+03  1.14796596e+04
  4.60013588e+03  3.71864955e+03  8.29747853e+02  6.13969107e+03
  7.65720774e+03  3.19610761e+04  1.24377041e+04  3.30230464e+03
  1.89526864e+04  2.78836799e+03  8.60455596e+03  1.99574486e+04
  3.71116595e+04  3.64106976e+04 -2.69525276e+03  1.04602916e+04
  9.33452959e+03  2.12449620e+04  8.85376783e+04  5.06170853e+03
  9.81408649e+03  8.04404197e+04  1.38735731e+04  7.52906599e+03
  4.67114693e+03  6.96156588e+03  1.16448338e+04 -6.86324707e+03
  1.69777471e+04  1.50449736e+04  3.92651129e+04  5.87531596e+04
  6.31268023e+04  1.23947763e+05  5.84055848e+01  1.05620917e+04
  2.57252710e+04 -1.97735732e+03  3.90562193e+02 -2.19362011e+03
 -1.55615913e+03  9.35714981e+04  9.05817303e+04  9.10376572e+03
  5.10984710e+03 -1.95594344e+03  1.38350257e+04  9.03899409e+03
 -7.14568370e+03  6.08692570e+03  5.02458543e+03  2.03863780e+03
  2.21741424e+04 -2.03371039e+03  9.37111490e+03  5.49523758e+03
  7.22876161e+03 -1.28002622e+02  1.30688738e+04  2.60638543e+04
 -4.66814542e+03 -3.55251706e+03 -1.00363890e+04  8.79435673e+03
  2.95725968e+03  2.74212461e+03  1.96877960e+04  4.04761557e+04
 -3.84099062e+01  7.30429055e+03  1.70255573e+04  1.35575003e+04
  2.91174420e+04  2.43616736e+04  8.29082273e+03  1.15513698e+04
  1.22268935e+04 -2.70978224e+03  2.76726213e+03  2.00313248e+04
  3.07020979e+04 -1.03021677e+03  8.46901801e+03  9.87850187e+03
  2.57077277e+05  3.58403785e+03  2.60205939e+05  5.84082498e+02
  2.09045259e+04 -1.71908133e+03  1.33857606e+04  3.17709483e+03
  1.38044724e+03  1.44099175e+05  3.06401409e+03  3.62675886e+04
 -9.28286054e+03  9.41429348e+03  1.00168258e+05  3.32624991e+04
 -6.53134277e+03  5.46719283e+04 -2.00003660e+03  2.66468842e+03
  9.46808248e+03 -2.86921370e+03  3.35280602e+03  2.52244290e+04
  2.14106748e+02  7.26392676e+03  1.94097726e+04  8.14625017e+03
 -8.15900353e+03 -8.03059465e+03  6.79305021e+02  8.29518745e+03
 -1.01588583e+04  1.24746470e+03 -7.31963428e+01  2.23994464e+04
  2.53013134e+04  1.44508028e+05  1.89570329e+05  8.22565146e+03
  1.18210657e+04 -3.28803183e+03  3.13853538e+03  5.31745353e+03
  1.39196182e+04  6.59499894e+03  1.04998232e+04 -1.38149779e+04
 -1.54078532e+03  8.73031539e+03  9.76696638e+03 -2.79857116e+03
  4.99203824e+03  2.33464956e+04 -5.58267750e+03  1.66193256e+04
 -1.53323036e+03 -1.31548409e+04  4.54514506e+03  3.88191130e+03
  1.62432264e+04  2.40357376e+03  3.51531509e+04  1.66984080e+04
  2.12461017e+03  5.20680821e+04  1.01849149e+04 -3.42821671e+03
  2.02819891e+04  1.57732709e+03  9.81240307e+04 -2.99024954e+03
  7.13674771e+03  1.52443027e+04  6.90283303e+03  9.05745108e+03
 -1.71863325e+03  5.80086244e+03 -1.52566756e+03  4.63657586e+03
  1.20985691e+03  1.35926898e+04  3.47456768e+03  2.28239687e+05
 -1.27901859e+03 -1.88409468e+03 -1.47617557e+03  1.55243814e+05
  5.23191736e+04 -4.60409663e+03  1.27074881e+04  6.23151393e+02
 -1.44532252e+03  8.59814681e+03  4.65215801e+04  8.79938843e+04
  6.88911495e+03  3.03770702e+04  2.61042460e+02 -2.94116256e+03
 -9.35963167e+03  8.82600304e+03 -1.96088006e+04  1.66481470e+04
  2.47769054e+04  7.07803372e+03  1.04636955e+05  1.11714871e+04
 -7.07913312e+02 -1.62717151e+03  1.29141417e+04  6.15453628e+03
  1.37486093e+03  1.40960954e+04 -1.33049647e+04  2.03286532e+03
  4.60840958e+03 -2.35409041e+03 -1.49589411e+03  7.55751041e+03
  1.34829440e+04  6.34575659e+03  1.23784849e+03 -2.98779638e+03
  2.56247834e+04  1.21185366e+05  9.54598131e+03  4.24847751e+04
  3.01404325e+05  5.77452341e+03  7.69167847e+03 -1.95122977e+03
  5.02769941e+03  1.09953859e+04  8.78191664e+03  2.66718953e+03
  3.49633915e+04  5.39284542e+04 -1.13506258e+03 -2.25367250e+03
 -3.17583404e+03 -2.81995962e+01  3.23599652e+04  3.90602561e+04
  5.37067892e+04  2.31480821e+05  3.00649439e+03  3.71060940e+04
  7.71549435e+04 -1.71716321e+03  2.31602344e+04  6.53475279e+03]
In [38]:
output_data = {'State':names,'County':counties,'Democratic':democratic_regression_predicted_list,'Republican':republican_regression_predicted_list,'Party':labels_list}
In [41]:
output_dataFrame = pd.DataFrame(output_data,columns =['State','County','Democratic','Republican','Party'])
In [42]:
output_dataFrame.head()
Out[42]:
State County Democratic Republican Party
0 NV eureka -11848.191722 -1585.698217 0
1 TX zavala -14554.521492 -10021.695886 1
2 VA king george 12367.465717 1607.095192 1
3 OH hamilton 244735.384266 153735.714839 1
4 TX austin 1307.715620 1216.207262 0
In [43]:
output_dataFrame.to_csv("Output.csv")
In [ ]: